
#include <sys/cdefs.h>
#include <cstdio>
#include <cstdlib>
#include <cassert>
typedef double doublev4 __attribute__((mode(V4DF)));
// Compile-time absolute value of the template parameter IDZ.
// (Single-return form keeps this valid even as a C++11 constexpr function.)
template <int IDZ>
constexpr int const_abs() {
  return IDZ >= 0 ? IDZ : -IDZ;
}
// Debug helper: print the four lanes of a doublev4 on one line.
__always_inline void print_doublev4(doublev4 v) {
  // Copy the vector into a plain array instead of casting &v, which keeps
  // the lane access free of pointer-aliasing concerns.
  double lanes[4];
  __builtin_memcpy(lanes, &v, sizeof(lanes));
  printf("%f %f %f %f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
// Primary template: only the MODE 0-3 specializations below are meaningful.
// Instantiating load() here means the OFF/MODE dispatch went wrong.
template <int OFF, int MODE>
struct unaligned {
  __always_inline static doublev4 load(double *q) {
    assert(0);
    // assert(0) compiles away under NDEBUG, which would let a non-void
    // function fall off its end (undefined behavior). Tell the compiler
    // explicitly that this point is never reached.
    __builtin_unreachable();
  }
};
// MODE 0: the offset needs no lane shuffling; a single unaligned vector
// load covers q[OFF .. OFF+3].
template <int OFF>
struct unaligned<OFF, 0> {
  // NOTE(review): __builtin_sw_vldd_u_f is presumably the SW26010 unaligned
  // 4-double vector load — confirm against the vendor intrinsics manual.
  __always_inline static doublev4 load(double *q) {
    return __builtin_sw_vldd_u_f(q + OFF);
  }
};
// MODE 2: assemble the vector for q[OFF .. OFF+3] from two loads displaced
// by ±2 lanes, then merge them with one shuffle.
template <int OFF>
struct unaligned<OFF, 2> {
  __always_inline static doublev4 load(double *q) {
    doublev4 lov4 = unaligned<OFF - 2, 0>::load(q);  // q[OFF-2 .. OFF+1]
    doublev4 hiv4 = unaligned<OFF + 2, 0>::load(q);  // q[OFF+2 .. OFF+5]
    // 0x4e encodes four 2-bit lane selectors (2,3,0,1 pattern) — presumably a
    // half-swap merge taking the top half of lov4 and bottom half of hiv4;
    // TODO confirm vshuffled lane-selection semantics in the vendor docs.
    return __builtin_sw_slave_vshuffled(hiv4, lov4, 0x4e);
  }
};
// MODE 1: offset is 1 mod 4. Combine a direct load at OFF-1 with the MODE 2
// path at OFF+1, then shuffle the pieces into place.
template <int OFF>
struct unaligned<OFF, 1> {
  __always_inline static doublev4 load(double *q) {
    doublev4 lov4 = unaligned<OFF - 1, 0>::load(q);  // q[OFF-1 .. OFF+2]
    doublev4 hiv4 = unaligned<OFF + 1, 2>::load(q);  // q[OFF+1 .. OFF+4]
    // 0x99 encodes lane selectors (1,2,1,2 pattern); presumably interleaves
    // the two sources into q[OFF .. OFF+3] — TODO confirm vshuffled semantics.
    return __builtin_sw_slave_vshuffled(hiv4, lov4, 0x99);
  }
};
// MODE 3: offset is 3 mod 4. Mirror of MODE 1 — the MODE 2 path supplies the
// low part and a direct load supplies the high part.
template <int OFF>
struct unaligned<OFF, 3> {
  __always_inline static doublev4 load(double *q) {
    doublev4 lov4 = unaligned<OFF - 1, 2>::load(q);  // q[OFF-1 .. OFF+2]
    doublev4 hiv4 = unaligned<OFF + 1, 0>::load(q);  // q[OFF+1 .. OFF+4]
    // Same 0x99 selector pattern as MODE 1; presumably interleaves the two
    // sources into q[OFF .. OFF+3] — TODO confirm vshuffled semantics.
    return __builtin_sw_slave_vshuffled(hiv4, lov4, 0x99);
  }
};
// Load q[OFF .. OFF+3] into a vector, dispatching on the offset's residue
// mod 4. The double-modulo keeps the residue non-negative when OFF < 0
// (C++ % truncates toward zero for negative operands).
template <int OFF>
__always_inline doublev4 load_unaligned(double *q) {
  enum { MODE = ((OFF % 4) + 4) % 4 };
  return unaligned<OFF, MODE>::load(q);
}
// Broadcast q[OFF] into all four lanes: one aligned load of the quad that
// contains index OFF, then a splat-shuffle selecting the matching lane.
// (Currently unused — kept as the alternative to __builtin_sw_ldde.)
template <int OFF>
__always_inline doublev4 load_expand(double *q) {
  doublev4 v = __builtin_sw_vldd_f(q + (OFF & ~3));  // aligned quad holding q[OFF]
  if ((OFF & 3) == 0)
    return __builtin_sw_slave_vshuffled(v, v, 0x00);  // splat lane 0
  else if ((OFF & 3) == 1)
    return __builtin_sw_slave_vshuffled(v, v, 0x55);  // splat lane 1
  else if ((OFF & 3) == 2)
    return __builtin_sw_slave_vshuffled(v, v, 0xaa);  // splat lane 2
  else
    // Final plain else: the four branches are exhaustive, but without it the
    // compiler sees a path falling off the end of a non-void function (UB).
    return __builtin_sw_slave_vshuffled(v, v, 0xff);  // splat lane 3
}
// One step of a compile-time-unrolled stencil accumulation.
// For the current tap IDZ it adds gdirect[|IDZ|] * q[k + IDZ + lane] into four
// vector accumulators covering 16 consecutive outputs (e0..e3 = k .. k+15),
// then recurses with IDZ + 1. The END parameter becomes 1 once IDZ reaches
// NDZ, selecting the terminating specialization below — so the last tap
// actually executed is IDZ == NDZ, i.e. taps run over [-NDZ, NDZ] when the
// caller starts at IDZ == -NDZ.
template <int NDZ, int IDZ, int END>
struct unrolled_loop {

  __always_inline static void run(doublev4 &e0v4, doublev4 &e1v4, doublev4 &e2v4, doublev4 &e3v4, int k, double *q, double *gdirect) {
    // Four unaligned vector loads, one per group of 4 outputs.
    doublev4 q0v4 = load_unaligned<IDZ>(q + k);
    doublev4 q1v4 = load_unaligned<IDZ+4>(q + k);
    doublev4 q2v4 = load_unaligned<IDZ+8>(q + k);
    doublev4 q3v4 = load_unaligned<IDZ+12>(q + k);
    // Broadcast the tap coefficient; presumably __builtin_sw_ldde is a
    // load-and-duplicate (scalar -> all lanes) — TODO confirm in vendor docs.
    doublev4 gdv4 = __builtin_sw_ldde(gdirect + const_abs<IDZ>());
    // doublev4 gdv4 = load_expand<const_abs<IDZ>()>(gdirect);
    e0v4 += q0v4 * gdv4;
    e1v4 += q1v4 * gdv4;
    e2v4 += q2v4 * gdv4;
    e3v4 += q3v4 * gdv4;
    // Tail recursion unrolls fully at compile time (all calls are inlined).
    unrolled_loop<NDZ, IDZ + 1, (IDZ >= NDZ)>::run(e0v4, e1v4, e2v4, e3v4, k, q, gdirect);
  }
};
// Terminating specialization (END == 1): ends the compile-time recursion
// once every tap in [-NDZ, NDZ] has been accumulated. Intentionally empty.
template <int NDZ, int IDZ>
struct unrolled_loop<NDZ, IDZ, 1> {

  __always_inline static void run(doublev4 &e0v4, doublev4 &e1v4, doublev4 &e2v4, doublev4 &e3v4, int k, double *q, double *gdirect) {
    // puts(__PRETTY_FUNCTION__);
  }
};
// Number of stencil taps on each side of the center point.
const int ndz = 13;

// Test/benchmark driver: computes a symmetric 1-D stencil
//   e[k] = sum_{dz=-ndz..ndz} g[|dz|] * q_off[k + dz]
// once with plain scalar loops (reference) and 100 times with the
// template-unrolled vector version, then prints both results and the
// cycle count. etest accumulates 100 passes, hence the e[i]*100 check.
int main(int argc, char **argv) {
  int nk = 64;  // number of output points; optional argv[1] override
  if (argc > 1)
    nk = atoi(argv[1]);
  // All arrays are statically sized for at most 64 outputs; an unchecked
  // command-line value would index out of bounds, so clamp it.
  if (nk < 0)
    nk = 0;
  if (nk > 64)
    nk = 64;
  double e[64], etest[64];  // scalar reference / vectorized result
  double q[64 + 32];        // input, padded 16 on each side for the +/-ndz taps
  double g[20];             // stencil coefficients, indexed by |dz|
  double *q_off = q + 16;   // q_off[k + dz] stays in bounds for dz in [-16, 16)
  for (int i = 0; i < 64; i++) {
    e[i] = 0;
    etest[i] = 0;
  }
  for (int i = 0; i < 96; i++) {
    q[i] = i;
  }
  for (int i = 0; i < 20; i++) {
    g[i] = i;
    if (i > ndz)
      g[i] = 0;  // coefficients beyond the stencil radius contribute nothing
  }
  // Compiler barrier: keep the initialization from being folded into the
  // timed/compared loops below.
  asm volatile("" ::
                   : "memory");
  // Scalar reference (negative and non-negative taps split so g is always
  // indexed by |dz|).
  for (int k = 0; k < nk; k++) {
    for (int dz = -ndz; dz < 0; dz++) {
      e[k] += g[-dz] * q_off[k + dz];
    }
    for (int dz = 0; dz <= ndz; dz++) {
      e[k] += g[dz] * q_off[k + dz];
    }
  }
  long st, ed;
  // NOTE(review): "rcsr %0, 4" presumably reads a cycle-counter CSR on the
  // SW slave core — confirm against the vendor ISA manual.
  asm volatile("rcsr %0, 4\n\t"
               : "=r"(st));
  // Vectorized version, repeated 100x for timing. Each outer pass processes
  // the outputs in groups of 16 (four doublev4 accumulators).
  for (int i = 0; i < 100; i++) {
    for (int k = 0; k < nk; k += 16) {
      doublev4 e0v4 = __builtin_sw_vldd_f(etest + k);
      doublev4 e1v4 = __builtin_sw_vldd_f(etest + k + 4);
      doublev4 e2v4 = __builtin_sw_vldd_f(etest + k + 8);
      doublev4 e3v4 = __builtin_sw_vldd_f(etest + k + 12);
      // Use ndz/-ndz rather than repeating the literal 13 so the template
      // unroll always matches the scalar reference above.
      unrolled_loop<ndz, -ndz, 0>::run(e0v4, e1v4, e2v4, e3v4, k, q_off, g);
      __builtin_sw_vstd_f(etest + k, e0v4);
      __builtin_sw_vstd_f(etest + k + 4, e1v4);
      __builtin_sw_vstd_f(etest + k + 8, e2v4);
      __builtin_sw_vstd_f(etest + k + 12, e3v4);
    }
  }
  asm volatile("rcsr %0, 4\n\t"
               : "=r"(ed));
  // Third column should be ~0: the vector path ran 100 times.
  for (int i = 0; i < nk; i++) {
    printf("%f %f %f\n", e[i], etest[i], e[i] * 100 - etest[i]);
  }
  // long, not int: both arguments are printed with %ld (an int here would be
  // undefined behavior on LP64 targets).
  long eff_flops = (ndz * 2 + 1) * nk * 2;  // one mul + one add per tap per output
  printf("%ld %ld\n", ed - st, eff_flops);
}